# import libraries
# ================
# for date and time operations
from datetime import datetime
# for file and folder operations
import os
# for regular expression operations
import re
# for listing files in a folder
import glob
# for getting web contents
import requests
# storing and analysing data
import pandas as pd
# for scraping web contents
from bs4 import BeautifulSoup
import re
# get data
# ========
# URL of the Ministry of Health & Family Welfare (India) homepage,
# where the live COVID-19 statistics table resides
URL = 'https://www.mohfw.gov.in/'
# download the page and parse the HTML
page = BeautifulSoup(requests.get(URL).content, "html.parser")
# find the table
# ==============
# the statistics table is the last table on the page, so take the last
# <thead> (its rows hold the column titles) and the last <tbody>
# (its rows hold one entry per state / UT)
head = page.find_all('thead')[-1].find_all('tr')
body = page.find_all('tbody')[-1].find_all('tr')
# get the table contents
# ======================
# each header row -> list of its cell texts (column titles);
# the table usually has a single header row
head_rows = [[cell.text for cell in tr.find_all(['th', 'td'])] for tr in head]
# each body row -> list of its cell texts (one row per state / UT)
body_rows = [[cell.text for cell in tr.find_all(['th', 'td'])] for tr in body]
# save contents in a dataframe
# ============================
# BUG FIX (comment/code mismatch): the original comment claimed the last
# 3 rows were skipped, but the code drops 6 — the last 6 scraped rows are
# footnotes / totals rather than state entries, so drop all 6.
# head_rows[0] holds the column titles.
df_bs = pd.DataFrame(body_rows[:-6], columns=head_rows[0])
# drop the serial-number column — it carries no information
df_bs.drop('S. No.', axis=1, inplace=True)
# shorten the state column name for convenient access
# (the no-op `df_bs.head(36)` expression from the notebook was removed)
df_bs = df_bs.rename(columns={"Name of State / UT": "State"}, errors="raise")
# date-time information
# =====================
# stamp every row with today's date; the month/day/year round-trip
# normalizes the timestamp to midnight, matching the original behavior
now = datetime.now()
df_bs['Date'] = pd.to_datetime(now.strftime("%m/%d/%Y"), format='%m/%d/%Y')
# strip the footnote marker '#' appended to some state names
# (regex=False: plain literal replacement, same result, future-proof)
df_bs['State'] = df_bs['State'].str.replace('#', '', regex=False)
# latitude and longitude information
# ==================================
# Hard-coded state/UT centroids. NOTE(review): the keys must match the
# names scraped from mohfw.gov.in exactly — the site's 'Telengana'
# spelling and 'Dadar Nagar Haveli' are kept as-is on purpose; verify
# they still match whenever the site changes. Any state missing from
# these dicts gets NaN coordinates from .map() below.
# latitude of the states
lat = {'Delhi':28.7041, 'Haryana':29.0588, 'Kerala':10.8505, 'Rajasthan':27.0238,
       'Telengana':18.1124, 'Uttar Pradesh':26.8467, 'Ladakh':34.2996, 'Tamil Nadu':11.1271,
       'Jammu and Kashmir':33.7782, 'Punjab':31.1471, 'Karnataka':15.3173, 'Maharashtra':19.7515,
       'Andhra Pradesh':15.9129, 'Odisha':20.9517, 'Uttarakhand':30.0668, 'West Bengal':22.9868,
       'Puducherry': 11.9416, 'Chandigarh': 30.7333, 'Chhattisgarh':21.2787, 'Gujarat': 22.2587,
       'Himachal Pradesh': 31.1048, 'Madhya Pradesh': 22.9734, 'Bihar': 25.0961, 'Manipur':24.6637,
       'Mizoram':23.1645, 'Goa': 15.2993, 'Andaman and Nicobar Islands': 11.7401, 'Assam' : 26.2006,
       'Jharkhand': 23.6102, 'Arunachal Pradesh': 28.2180, 'Tripura': 23.9408, 'Nagaland': 26.1584,
       'Meghalaya' : 25.4670, 'Dadar Nagar Haveli' : 20.1809, 'Sikkim':27.5330}
# longitude of the states
long = {'Delhi':77.1025, 'Haryana':76.0856, 'Kerala':76.2711, 'Rajasthan':74.2179,
        'Telengana':79.0193, 'Uttar Pradesh':80.9462, 'Ladakh':78.2932, 'Tamil Nadu':78.6569,
        'Jammu and Kashmir':76.5762, 'Punjab':75.3412, 'Karnataka':75.7139, 'Maharashtra':75.7139,
        'Andhra Pradesh':79.7400, 'Odisha':85.0985, 'Uttarakhand':79.0193, 'West Bengal':87.8550,
        'Puducherry': 79.8083, 'Chandigarh': 76.7794, 'Chhattisgarh':81.8661, 'Gujarat': 71.1924,
        'Himachal Pradesh': 77.1734, 'Madhya Pradesh': 78.6569, 'Bihar': 85.3131, 'Manipur':93.9063,
        'Mizoram':92.9376, 'Goa': 74.1240, 'Andaman and Nicobar Islands': 92.6586, 'Assam' : 92.9376,
        'Jharkhand': 85.2799, 'Arunachal Pradesh': 94.7278, 'Tripura': 91.9882, 'Nagaland': 94.5624,
        'Meghalaya' : 91.3662, 'Dadar Nagar Haveli' : 73.0169, 'Sikkim':88.5122}
# add latitude column keyed on the 'State' column (unmatched names -> NaN)
df_bs['Latitude'] = df_bs['State'].map(lat)
# add longitude column keyed on the 'State' column (unmatched names -> NaN)
df_bs['Longitude'] = df_bs['State'].map(long)
# read data
# =========
# number of government/private hospitals and beds per state
# (local snapshot; original online source:
#  github.com/souroy12/Hospital-Bed-Analysis)
df1 = pd.read_csv('C://Users/skoul2/AnacondaProjects/Covid/datasets_Number of Hospitals and Beds in Public and Private Areas .csv')
# left-join on the columns the two frames share (their common 'State'
# column); states without hospital data keep NaN in the bed columns
combined_df = pd.merge(df_bs, df1, how='left')
# drop the unwanted row at positional label 17
# NOTE(review): magic index — verify it still points at the intended row
# whenever the scraped table's layout changes
combined_df = combined_df.drop(17)
# normalize the scraped column names in a single rename (the original
# chained four separate .rename calls, copying the frame each time);
# errors="raise" still fails loudly if any expected column is missing
combined_df = combined_df.rename(columns={"Deaths**": "Deaths",
                                          "Total Confirmed cases*": "Confirmed",
                                          "Cured/Discharged/Migrated*": "Cured_Discharged_Migrated",
                                          "Active Cases*": "Active"},
                                 errors="raise")
# convert the case-count columns from scraped strings to int
count_cols = ['Active', 'Deaths', 'Confirmed', 'Cured_Discharged_Migrated']
combined_df[count_cols] = combined_df[count_cols].astype(int)
class color:
    """ANSI terminal escape codes used to emphasise printed headings.

    Concatenate (or print alongside) a code before the text and finish
    with END to reset the terminal back to its default attributes.
    """
    PURPLE = '\033[95m'     # bright magenta
    CYAN = '\033[96m'       # bright cyan
    DARKCYAN = '\033[36m'   # standard cyan
    BLUE = '\033[94m'       # bright blue
    GREEN = '\033[92m'      # bright green
    YELLOW = '\033[93m'     # bright yellow
    RED = '\033[91m'        # bright red
    BOLD = '\033[1m'        # bold attribute
    UNDERLINE = '\033[4m'   # underline attribute
    END = '\033[0m'         # reset all attributes
# headline banner, then a 3-D state-wise scatter of the case counts
print(color.BOLD, color.RED,
      color.UNDERLINE + 'Current Situation in India according www.mohfw.gov.in and this code will pull the latest data from this site through Web scraping' + color.END)
print('3D Statewise view for Cured/Discharged/Migrated, Deaths and Confirmed')
import plotly.express as px
import numpy as np
# one bubble per state: axes are cured/active/deaths (all log-scaled),
# bubble size encodes the confirmed count
scatter3d = px.scatter_3d(combined_df, x='Cured_Discharged_Migrated',
                          y='Active', z='Deaths',
                          size='Confirmed', color='State')
scatter3d.update_layout(height=800, width=800,
                        scene_xaxis_type="log",
                        scene_yaxis_type="log",
                        scene_zaxis_type="log")
scatter3d.show()
import folium
# base map centred on India
indiacovid = folium.Map(location=[20.5937, 78.9629], zoom_start=5.4)
# one circle marker per state with confirmed/death counts in the popup.
# Loop variables renamed: the original reused `lat`, shadowing the
# module-level latitude dict defined above.
for m_lat, m_lon, state, deaths, confirmed in zip(combined_df['Latitude'],
                                                  combined_df['Longitude'],
                                                  combined_df['State'],
                                                  combined_df['Deaths'],
                                                  combined_df['Confirmed']):
    # BUG FIX: a stray trailing comma made the popup a 2-tuple, so folium
    # rendered the tuple's repr instead of the intended HTML; build a
    # single concatenated string instead
    folium.CircleMarker([m_lat, m_lon],
                        radius=5,
                        color='Green',
                        popup=('State:' + str(state) + '<br>'
                               'Total Confirmed cases:' + str(confirmed) + '<br>'
                               'Death :' + str(deaths) + '<br>'),
                        fill_color='Yellow',
                        fill_opacity=0.7).add_to(indiacovid)
# display the map in the notebook — the original bare `folium` expression
# was a no-op (it showed the module's repr, not the map)
indiacovid
print('Geographical & holistic view of patients and beds availability across hospitals')
# second layer of circles (area scales with the confirmed count) whose
# tooltip lists the hospital / bed availability figures for each state
for idx in range(len(combined_df)):
    row = combined_df.iloc[idx]
    tooltip_html = ('<li><bold>State : ' + str(row['State']) +
                    '<li><bold>Total Beds available : ' + str(row['total_beds']) +
                    '<li><bold>Total Beds available in private hospital : ' + str(row['beds_private_sector']) +
                    '<li><bold>Total Beds available in Public hospital : ' + str(row['beds_ public_sector']) +
                    '<li><bold>Total hospitals : ' + str(row['total_hospitals']) +
                    '<li><bold>Total Public hospitals : ' + str(row['hospitals_ public_sector']) +
                    '<li><bold>Total Private hospitals : ' + str(row['hospitals_ private_sector']) +
                    '<li><bold>Active cases : ' + str(row['Active']) +
                    '<li><bold>Deaths : ' + str(row['Deaths']) +
                    '<li><bold>Cured_Discharged_Migrated : ' + str(row['Cured_Discharged_Migrated']) +
                    '<li><bold>Total Confirmed : ' + str(row['Confirmed']))
    # NOTE(review): some column names contain a space ('beds_ public_sector',
    # 'hospitals_ public_sector', ...) — they must match the CSV headers
    folium.Circle(location=[row['Latitude'], row['Longitude']],
                  color='crimson',
                  tooltip=tooltip_html,
                  radius=int(row['Confirmed']) ** 1.1).add_to(indiacovid)
# display the map in the notebook
indiacovid
print('Total Deaths,Active,Cured/Discharged/Migrated as per each state')
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10,10))
plt.barh(combined_df['State'],combined_df['Deaths'], label = 'Deaths')
plt.barh(combined_df['State'],combined_df['Active'], label = 'Total Active')
plt.barh(combined_df['State'],combined_df['Cured_Discharged_Migrated'], label = ' Total Cured/Discharged/Migrated')
plt.ylabel('States')
plt.xlabel('Confirmed')
plt.legend()
plt.show()
# (planning notes for the modelling section further below: select
# features; filter train data 2020-03-01..2020-03-18; log-transform
# ConfirmedCases and Fatalities, replacing the infinities produced by
# log(0) with 0; train/valid/test split; per-country linear regression;
# submit after reversing the log transform with an exponential)
combined_df_ml = combined_df
# Overall
import plotly.graph_objs as go
# nationwide totals for the donut chart
total_active = combined_df['Active'].sum()
total_recovered = combined_df['Cured_Discharged_Migrated'].sum()
total_deaths = combined_df['Deaths'].sum()
fig = go.Figure(data=[go.Pie(labels=['Active', 'Cured', 'Death'],
                             values=[total_active, total_recovered, total_deaths],
                             hole=.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=['#263fa3', '#2fcc41', '#cc3c2f'],
                              line=dict(color='#FFFFFF', width=2)))
fig.update_layout(title_text='Total overall cases', plot_bgcolor='rgb(275, 270, 273)')
fig.show()
# figure for the per-state share-of-confirmed-cases pie chart
fig = plt.figure(figsize=(10,10))
# confirmed cases per state, largest first
conf_per_country = combined_df.groupby('State')['Confirmed'].sum().sort_values(ascending=False)
# grand total of confirmed cases
# NOTE(review): currently unused — possibly intended for converting the
# pie percentages back to absolute counts; confirm or remove
conf_sum=combined_df['Confirmed'].sum()
def absolute_value(val):
    """autopct callback for the pie chart: round the value to 2 decimals.

    NOTE(review): despite the name, matplotlib passes the slice's
    percentage here, so the chart shows rounded percentages, not
    absolute counts — confirm that is the intent.
    """
    return np.round(val, 2)
# draw the pie: each slice is annotated with its rounded percentage share
conf_per_country.plot(kind="pie", autopct=absolute_value,
                      title='Percentage of confirmed cases per state')
plt.show()
# line chart of the 20 states with the most confirmed cases
df2 = (combined_df
       .groupby('State')[['Cured_Discharged_Migrated', 'Deaths', 'Confirmed']]
       .sum()
       .nlargest(20, 'Confirmed'))
plt.figure(figsize=(20, 10))
plt.title('top 20 states with confirmed cases', fontsize=30)
plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.xlabel('State', fontsize=20)
plt.ylabel('Cases', fontsize=20)
plt.plot(df2.index, df2['Confirmed'], marker='o', mfc='black',
         label='Confirmed', markersize=10, linewidth=5)
plt.plot(df2.index, df2['Deaths'], marker='o', mfc='black',
         label='Deaths', markersize=10, linewidth=5)
plt.plot(df2.index, df2['Cured_Discharged_Migrated'], marker='o', mfc='black',
         label='Cured_Discharged_Migrated', markersize=10, linewidth=5,
         color='green')
plt.legend(fontsize=20)
# read data
# =========
# daily state-wise COVID-19 records for India (local snapshot; the
# original online source is commented in the first read above) —
# presumably one row per state per day; verify against the CSV
combined_df_ml = pd.read_csv('C://Users/skoul2/AnacondaProjects/Covid/datasets_covid_19_india.csv')
from scipy import optimize
# logistic-fit inputs: x = record index (days since first confirmed
# record), y = the confirmed count for that record
x_data = range(len(combined_df_ml))
y_data = combined_df_ml['Confirmed']
def log_curve(x, k, x_0, ymax):
    """Logistic (sigmoid) growth curve.

    x    -- day index (scalar or array-like)
    k    -- growth rate (steepness of the curve)
    x_0  -- midpoint: the x at which the curve reaches ymax / 2
    ymax -- upper asymptote (plateau value)
    """
    exponent = -k * (x - x_0)
    return ymax / (1 + np.exp(exponent))
# Fit the curve
# =============
# least-squares fit of the logistic parameters to the observed counts;
# all three parameters are constrained non-negative
popt, pcov = optimize.curve_fit(log_curve, x_data, y_data,
                                bounds=([0, 0, 0], np.inf), maxfev=50000)
# unpack the fitted parameters directly (growth rate, midpoint, plateau)
k, x_0, ymax = popt
print(k, x_0, ymax)
# evaluate the fitted curve over a 160-day horizon
y_fitted = log_curve(range(0, 160), k, x_0, ymax)
print('Growthfactor for confirmed cases')
# plot the fitted curve against the observed data
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(range(0, 160), y_fitted, '--', label='fitted')
ax.plot(x_data, y_data, 'o', label='Confirmed Data')
# encode state names as integers so they can serve as a numeric feature
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()
combined_df_ml['State/UnionTerritory'] = lbl.fit_transform(combined_df_ml['State/UnionTerritory'])
# NOTE(review): several of these imports (DecisionTreeRegressor,
# RandomForestClassifier, accuracy_score, metrics, datetime) are unused
# in this section — kept in case later cells rely on them
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import datetime
from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn import metrics
# single-feature regression: predict Confirmed cases from Deaths
x = combined_df_ml[['Deaths']]
y = combined_df_ml['Confirmed']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# Fitting Simple Linear Regression to the Training set
# (the exact duplicate LinearRegression import that was here is removed —
# the class is already imported above)
regressor = LinearRegression()
regressor.fit(x_train, y_train)
# Predicting the Test set results
y_pred = regressor.predict(x_test)
# measured-vs-predicted scatter with the ideal y = x reference line
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured Covid Cases ')
ax.set_ylabel('Predicted Covid Cases')
plt.title('Measured Vs Predicted Covid-19 cases')
plt.show()
# BUG FIX: corrected typo in the user-facing message ('dealth' -> 'death')
print('Predicted Covid-19 cases per death')